/*
* Copyright (c) 2014. The Trustees of Indiana University.
*
* This version of the code is licensed under the MPL 2.0 Open Source license with additional
* healthcare disclaimer. If the user is an entity intending to commercialize any application
* that uses this code in a for-profit venture, please contact the copyright holder.
*/
package com.muzima.service;
import com.muzima.utils.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class HTMLConceptParser {
public static final String DATA_CONCEPT_TAG = "data-concept";
public List<String> parse(String html) {
Set<String> concepts = new HashSet<String>();
Document htmlDoc = Jsoup.parse(html);
//Select all elements containing data-concept attr and is not a div.
Elements elements = htmlDoc.select("*:not(div)[" + DATA_CONCEPT_TAG + "]");
for (Element element : elements) {
concepts.add(getConceptName(element.attr(DATA_CONCEPT_TAG)));
}
return new ArrayList<String>(concepts);
}
private static String getConceptName(String conceptName) {
if (!StringUtils.isEmpty(conceptName) && conceptName.split("\\^").length > 1) {
return conceptName.split("\\^")[1];
}
return "";
}
}